06 - Facies Classifier

George Crowther

This is an extension / amalgamation of prior entries. The workflow is broadly similar to those completed previously:

  • Load the data and map string features to integers
  • Cursory data examination; this workbook does not attempt to detail the full data analysis
  • Group the data by well and brute-force feature creation
    • Feature creation focuses on bringing results from adjacent samples into the features
    • Look at some ratios between features
  • Leaving out two wells at a time, use TPOT to generate a pipeline for prediction (a rough sketch of this step follows the list)
  • Take a modal vote over the fitted models' predictions on the test data set

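The TPOT search itself is not executed in this notebook; the pipelines hard-coded in the models list below are its exported results. As a rough, hypothetical sketch only (the raw log curves stand in for the engineered features, and generations / population_size are illustrative values, not the settings actually used), the search for each leave-two-wells-out split would look something like this:

import pandas as pd
from sklearn.model_selection import LeavePGroupsOut
from tpot import TPOTClassifier

# Illustrative only: raw log curves stand in for the engineered features built below
df = pd.read_csv('../training_data.csv').dropna()
log_columns = ['GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']
X = df[log_columns].values
y = df['Facies'].values
wells = df['Well Name'].values

lpgo = LeavePGroupsOut(n_groups=2)
for i, (train_idx, val_idx) in enumerate(lpgo.split(X, y, groups=wells)):
    tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
    tpot.fit(X[train_idx], y[train_idx])
    print('Split %d held-out score: %.3f' % (i, tpot.score(X[val_idx], y[val_idx])))
    tpot.export('tpot_pipeline_split_%d.py' % i)  # the exported code is what populates the models list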
In [1]:
import pandas as pd
import bokeh.plotting as bk
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor

import sys
sys.path.append('~/home/slygeorge/Documents/Python/SEG ML Competition')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

bk.output_notebook()


/home/slygeorge/anaconda3/envs/seg_competition/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Loading BokehJS ...

In [98]:
from scipy.stats import mode
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFwe, SelectKBest, f_classif, SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, Binarizer, Normalizer, StandardScaler
from xgboost import XGBClassifier

In [115]:
# One pipeline per leave-two-wells-out split (16 in total), as generated and exported by TPOT
models = [
    make_pipeline(
    MinMaxScaler(),
    XGBClassifier(learning_rate=0.02, max_depth=5, min_child_weight=20, n_estimators=500, subsample=0.19)
),
    make_pipeline(
    make_union(VotingClassifier([("est", LogisticRegression(C=0.13, dual=False, penalty="l1"))]), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    Binarizer(threshold=0.72),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    make_union(VotingClassifier([("est", GradientBoostingClassifier(learning_rate=1.0, max_features=1.0, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    BernoulliNB(alpha=28.0, binarize=0.85, fit_prior=True)
),
    make_pipeline(
    Normalizer(norm="l1"),
    make_union(VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]), FunctionTransformer(lambda X: X)),
    SelectKBest(k=47, score_func=f_classif),
    SelectFwe(alpha=0.05, score_func=f_classif),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    make_union(VotingClassifier([("est", LinearSVC(C=0.26, dual=False, penalty="l2"))]), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    Normalizer(norm="l2"),
    make_union(VotingClassifier([("est", ExtraTreesClassifier(criterion="entropy", max_features=0.3, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    GaussianNB()
),
    make_pipeline(
    make_union(VotingClassifier([("est", BernoulliNB(alpha=49.0, binarize=0.06, fit_prior=True))]), FunctionTransformer(lambda X: X)),
    StandardScaler(),
    make_union(VotingClassifier([("est", GradientBoostingClassifier(learning_rate=0.87, max_features=0.87, n_estimators=500))]), FunctionTransformer(lambda X: X)),
    ExtraTreesClassifier(criterion="entropy", max_features=0.001, n_estimators=500)
),
    make_pipeline(
    make_union(VotingClassifier([("est", RandomForestClassifier(n_estimators=500))]), FunctionTransformer(lambda X: X)),
    BernoulliNB(alpha=1e-06, binarize=0.09, fit_prior=True)
),
    make_pipeline(
    Normalizer(norm="max"),
    MinMaxScaler(),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    SelectPercentile(percentile=18, score_func=f_classif),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    SelectKBest(k=50, score_func=f_classif),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    XGBClassifier(learning_rate=0.51, max_depth=10, min_child_weight=20, n_estimators=500, subsample=1.0)
),
    make_pipeline(
    make_union(VotingClassifier([("est", KNeighborsClassifier(n_neighbors=5, weights="uniform"))]), FunctionTransformer(lambda X: X)),
    RandomForestClassifier(n_estimators=500)
),
    make_pipeline(
    StandardScaler(),
    SelectPercentile(percentile=19, score_func=f_classif),
    LinearSVC(C=0.02, dual=False, penalty="l1")
),
    make_pipeline(
    XGBClassifier(learning_rate=0.01, max_depth=10, min_child_weight=20, n_estimators=500, subsample=0.36)
)]

In [125]:
train_path = '../training_data.csv'
test_path = '../validation_data_nofacies.csv'

facies_labels = ['SS', 'CSiS', 'FSiS', 'SiSh', 'MS', 'WS', 'D', 'PS', 'BS']

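# `resample_factors` and `div_columns` are referenced inside feature_extraction()
# but their definitions are not shown in this cell. `div_columns` is recovered from
# the ratio / above / below column names in the output; the window lengths in
# `resample_factors` are assumed values, not necessarily the originals.
resample_factors = [2, 5, 10, 25, 50]
div_columns = ['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS']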

def feature_extraction(file_path, retain_class = True):

    # Read training data to dataframe
    test = pd.read_csv(file_path)
    
    if 'Facies' in test.columns:
        test.rename(columns={'Facies': 'class'}, inplace=True)

    # Set string features to integers

    for i, value in enumerate(test['Formation'].unique()):
        test.loc[test['Formation'] == value, 'Formation'] = i

    for i, value in enumerate(test['Well Name'].unique()):
        test.loc[test['Well Name'] == value, 'Well Name'] = i

    # Compute rolling-window statistics per well and interpolate the results; the
    # objective is to provide significantly more derived features for the classifier
    # and to capture more of the interdependency between adjacent samples.
    test['orig_index'] = test.index

    # Use rolling windows through upsampled frame, grouping by well name.

    # Empty list to hold frames
    mean_frames = []
    above = []
    below = []

    for well, group in test.groupby('Well Name'):
        # Empty list to hold rolling frames
        constructor_list = []
        for f in resample_factors:

            working_frame = group[['Depth', 'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M',
           'RELPOS']]

            mean_frame = working_frame.rolling(window = f, center = True).mean().interpolate(method = 'index', limit_direction = 'both', limit = None)
            mean_frame.columns = ['Mean_{0}_{1}'.format(f, column) for column in mean_frame.columns]
            max_frame = working_frame.rolling(window = f, center = True).max().interpolate(method = 'index', limit_direction = 'both', limit = None)
            max_frame.columns = ['Max_{0}_{1}'.format(f, column) for column in max_frame.columns]
            min_frame = working_frame.rolling(window = f, center = True).min().interpolate(method = 'index', limit_direction = 'both', limit = None)
            min_frame.columns = ['Min_{0}_{1}'.format(f, column) for column in min_frame.columns]
            std_frame = working_frame.rolling(window = f, center = True).std().interpolate(method = 'index', limit_direction = 'both', limit = None)
            std_frame.columns = ['Std_{0}_{1}'.format(f, column) for column in std_frame.columns]
            var_frame = working_frame.rolling(window = f, center = True).var().interpolate(method = 'index', limit_direction = 'both', limit = None)
            var_frame.columns = ['Var_{0}_{1}'.format(f, column) for column in var_frame.columns]
            diff_frame = working_frame.diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None)
            diff_frame.columns = ['Diff_{0}_{1}'.format(f, column) for column in diff_frame.columns]
            rdiff_frame = working_frame.sort_index(ascending = False).diff(f, axis = 0).interpolate(method = 'index', limit_direction = 'both', limit = None).sort_index()
            rdiff_frame.columns = ['Rdiff_{0}_{1}'.format(f, column) for column in rdiff_frame.columns]
            skew_frame = working_frame.rolling(window = f, center = True).skew().interpolate(method = 'index', limit_direction = 'both', limit = None)
            skew_frame.columns = ['Skew_{0}_{1}'.format(f, column) for column in skew_frame.columns]

            # Note: skew_frame is computed above but is not included in this concatenation
            f_frame = pd.concat((mean_frame, max_frame, min_frame, std_frame, var_frame, diff_frame, rdiff_frame), axis = 1)

            constructor_list.append(f_frame)

        well_frame = pd.concat(constructor_list, axis = 1)
        well_frame['Well Name'] = well
        # orig index is holding the original index locations, to make extracting the results trivial
        well_frame['orig_index'] = group['orig_index']
        df = group.sort_values('Depth')
        # shift(-1) pulls values from the next (deeper) sample, shift(1) from the previous (shallower) one
        u = df.shift(-1).fillna(method = 'ffill')
        b = df.shift(1).fillna(method = 'bfill')
        above.append(u[div_columns])
        below.append(b[div_columns])

        mean_frames.append(well_frame.fillna(method = 'bfill').fillna(method = 'ffill'))

    frame = test
    frame.index = frame['orig_index']
    frame.drop(['orig_index', 'Well Name'], axis = 1, inplace = True)

    for f in mean_frames:
        f.index = f['orig_index']

    rolling_frame = pd.concat(mean_frames, axis = 0)
    above_frame = pd.concat(above)
    above_frame.columns = ['above_'+ column for column in above_frame.columns]
    below_frame = pd.concat(below)
    below_frame.columns = ['below_'+ column for column in below_frame.columns]
    upsampled_frame = pd.concat((frame, rolling_frame, above_frame, below_frame), axis = 1)

    features = [feature for feature in upsampled_frame.columns if 'class' not in feature]

    std_scaler = preprocessing.StandardScaler().fit(upsampled_frame[features])
    train_std = std_scaler.transform(upsampled_frame[features])

    train_std_frame = upsampled_frame
    for i, column in enumerate(features):
        train_std_frame.loc[:, column] = train_std[:, i]

    upsampled_frame_std = train_std_frame

    # Ratio features between the (already standardised) curves; column '{a}_{b}' holds b / a.
    # Note these are added after `features` is built, so they appear in the returned frame
    # but not in the feature list the models are fitted on.
    for feature in div_columns:
        for f in div_columns:
            if f == feature:
                continue
            upsampled_frame['{0}_{1}'.format(feature, f)] = upsampled_frame[f] / upsampled_frame[feature]
 
    return upsampled_frame_std, features

train_data_set, features = feature_extraction(train_path)
test_data_set, test_features = feature_extraction(test_path)

In [126]:
train_data_set.head()


Out[126]:
class Formation Depth GR ILD_log10 DeltaPHI PHIND PE NM_M RELPOS ... NM_M_PHIND NM_M_PE NM_M_RELPOS RELPOS_Depth RELPOS_GR RELPOS_ILD_log10 RELPOS_DeltaPHI RELPOS_PHIND RELPOS_PE RELPOS_NM_M
0 3 -1.346013 -0.632316 0.366749 0.088008 1.212737 -0.203723 0.976532 -0.996911 1.672943 ... 0.204354 -0.979558 -1.678127 -0.377966 0.219224 0.052606 0.724912 -0.121775 0.583721 -0.595902
1 3 -1.346013 -0.628499 0.393005 0.075601 2.035209 -0.119283 0.418505 -0.996911 1.599708 ... 0.119652 -0.419802 -1.604665 -0.392884 0.245673 0.047259 1.272238 -0.074565 0.261613 -0.623183
2 3 -1.346013 -0.624682 0.418613 0.063194 2.149973 -0.056278 -0.139522 -0.996911 1.522985 ... 0.056452 0.139955 -1.527705 -0.410169 0.274863 0.041494 1.411683 -0.036952 -0.091611 -0.654577
3 3 -1.346013 -0.620865 0.647138 0.050788 1.977828 -0.047834 -0.251128 -0.996911 1.449750 ... 0.047982 0.251906 -1.454243 -0.428256 0.446379 0.035032 1.364254 -0.032994 -0.173221 -0.687643
4 3 -1.346013 -0.617047 0.273719 0.017704 1.901319 -0.023801 -0.362733 -0.996911 1.376515 ... 0.023875 0.363857 -1.380781 -0.448268 0.198849 0.012861 1.381255 -0.017291 -0.263516 -0.724228

5 rows × 364 columns


In [127]:
from sklearn.model_selection import LeavePGroupsOut

lpgo = LeavePGroupsOut(2)

split_list = []
fitted_models = []

for train, val in lpgo.split(train_data_set[features], 
                             train_data_set['class'], 
                             groups = train_data_set['Well Name']):
    # Keep only splits where every facies class appears in both the training and validation wells
    hist_tr = np.histogram(train_data_set.loc[train, 'class'],
                           bins = np.arange(len(facies_labels) + 1) + 0.5)
    hist_val = np.histogram(train_data_set.loc[val, 'class'],
                           bins = np.arange(len(facies_labels) + 1) + 0.5)
    if np.all(hist_tr[0] != 0) & np.all(hist_val[0] != 0):
        split_list.append({'train': train, 'val': val})
        
# 'Well Name' was label-encoded and then standardised inside feature_extraction(),
# so the groups print below as scaled floats rather than the original well names.
for s, split in enumerate(split_list):
    print('Split %d' % s)
    print('    training:   %s' % (train_data_set['Well Name'].loc[split['train']].unique()))
    print('    validation: %s' % (train_data_set['Well Name'].loc[split['val']].unique()))


Split 0
    training:   [-0.53211804 -0.10387962  0.3243588   0.75259723  1.18083565  1.60907407]
    validation: [-1.38859488 -0.96035646]
Split 1
    training:   [-0.96035646 -0.53211804  0.3243588   0.75259723  1.18083565  1.60907407]
    validation: [-1.38859488 -0.10387962]
Split 2
    training:   [-0.96035646 -0.53211804 -0.10387962  0.75259723  1.18083565  1.60907407]
    validation: [-1.38859488  0.3243588 ]
Split 3
    training:   [-0.96035646 -0.53211804 -0.10387962  0.3243588   0.75259723  1.18083565]
    validation: [-1.38859488  1.60907407]
Split 4
    training:   [-1.38859488 -0.53211804 -0.10387962  0.3243588   1.18083565  1.60907407]
    validation: [-0.96035646  0.75259723]
Split 5
    training:   [-1.38859488 -0.53211804 -0.10387962  0.3243588   0.75259723  1.60907407]
    validation: [-0.96035646  1.18083565]
Split 6
    training:   [-1.38859488 -0.53211804 -0.10387962  0.3243588   0.75259723  1.18083565]
    validation: [-0.96035646  1.60907407]
Split 7
    training:   [-1.38859488 -0.96035646 -0.10387962  0.3243588   0.75259723  1.18083565]
    validation: [-0.53211804  1.60907407]
Split 8
    training:   [-1.38859488 -0.96035646 -0.53211804  0.3243588   1.18083565  1.60907407]
    validation: [-0.10387962  0.75259723]
Split 9
    training:   [-1.38859488 -0.96035646 -0.53211804  0.3243588   0.75259723  1.60907407]
    validation: [-0.10387962  1.18083565]
Split 10
    training:   [-1.38859488 -0.96035646 -0.53211804  0.3243588   0.75259723  1.18083565]
    validation: [-0.10387962  1.60907407]
Split 11
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  1.18083565  1.60907407]
    validation: [ 0.3243588   0.75259723]
Split 12
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.75259723  1.60907407]
    validation: [ 0.3243588   1.18083565]
Split 13
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.75259723  1.18083565]
    validation: [ 0.3243588   1.60907407]
Split 14
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.3243588   1.18083565]
    validation: [ 0.75259723  1.60907407]
Split 15
    training:   [-1.38859488 -0.96035646 -0.53211804 -0.10387962  0.3243588   0.75259723]
    validation: [ 1.18083565  1.60907407]

In [135]:
fitted_models = []
r = []

for i, split in enumerate(split_list):

    # Select training and validation data from current split (the validation slice
    # is not scored in this cell; a scoring sketch follows the cell output below)
    X_tr = train_data_set.loc[split['train'], features]
    X_v = train_data_set.loc[split['val'], features]
    y_tr = train_data_set.loc[split['train'], 'class']
    y_v = train_data_set.loc[split['val'], 'class']

    # Fit model from split
    fitted_models.append(models[i].fit(X_tr, y_tr))
    
    # Predict for model
    r.append(fitted_models[-1].predict(test_data_set[test_features]))
    
# Per-sample majority (modal) vote across the fitted models' predictions
results = mode(np.vstack(r))[0][0]

test_data_set['Facies'] = results


/home/slygeorge/anaconda3/envs/seg_competition/lib/python3.5/site-packages/sklearn/feature_selection/univariate_selection.py:114: RuntimeWarning: divide by zero encountered in true_divide
  f = msb / msw

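The X_v / y_v validation slices selected in the cell above are never actually scored. A minimal sketch (not part of the original run) of how each fitted pipeline could be checked against the two wells it never saw, assuming scikit-learn's micro-averaged F1 as the metric:

from sklearn.metrics import f1_score

# Hypothetical per-split check: score each pipeline on its held-out wells
for i, split in enumerate(split_list):
    X_v = train_data_set.loc[split['val'], features]
    y_v = train_data_set.loc[split['val'], 'class']
    pred = fitted_models[i].predict(X_v)
    print('Model %2d held-out micro F1: %.3f' % (i, f1_score(y_v, pred, average='micro')))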
In [137]:
test_data_set.iloc[:, ::-1].head()


Out[137]:
Facies RELPOS_NM_M RELPOS_PE RELPOS_PHIND RELPOS_DeltaPHI RELPOS_ILD_log10 RELPOS_GR RELPOS_Depth NM_M_RELPOS NM_M_PE ... Mean_2_Depth RELPOS NM_M PE PHIND DeltaPHI ILD_log10 GR Depth Formation
orig_index
0 2 -0.884952 -0.059289 -0.118109 0.079374 -0.076787 0.191931 -1.156836 -1.130005 0.066997 ... -1.892984 1.640888 -1.452107 -0.097287 -0.193803 0.130243 -0.125999 0.314937 -1.898239 -1.351854
1 2 -0.928980 -0.308522 0.036349 0.678435 -0.180501 0.456716 -1.211001 -1.076450 0.332108 ... -1.892984 1.563120 -1.452107 -0.482257 0.056818 1.060476 -0.282144 0.713902 -1.892939 -1.351854
2 2 -0.977618 -0.611844 0.252408 1.281514 -0.234336 0.618819 -1.270837 -1.022894 0.625852 ... -1.887683 1.485351 -1.452107 -0.908803 0.374915 1.903499 -0.348072 0.919164 -1.887639 -1.351854
3 2 -1.034229 -0.742690 0.218967 1.376426 -0.181180 0.596973 -1.340651 -0.966904 0.718110 ... -1.882383 1.404048 -1.452107 -1.042773 0.307440 1.932569 -0.254385 0.838179 -1.882338 -1.351854
4 2 -1.094872 -0.736314 0.100984 1.281789 -0.074072 0.503166 -1.415266 -0.913348 0.672511 ... -1.877082 1.326280 -1.452107 -0.976558 0.133933 1.700011 -0.098240 0.667339 -1.877038 -1.351854

5 rows × 364 columns


In [139]:
test_data_set.iloc[:, ::-1].to_csv('06 - Combined Models.csv')
